This file is designed to use CDC data to assess coronavirus disease burden by state, including creating and analyzing state-level cluters.
Through March 7, 2021, The COVID Tracking Project collected and integrated data on tests, cases, hospitalizations, deaths, and the like by state and date. The latest code for using this data is available in Coronavirus_Statistics_CTP_v004.Rmd.
The COVID Tracking Project suggest that US federal data sources are now sufficiently robust to be used for analyses that previously relied on COVID Tracking Project. This code is an attempt to update modules in Coronavirus_Statistics_CTP_v004.Rmd to leverage US federal data.
The code in this module builds on code available in _v001, and splits many functions in to two main .R files that can be sourced:
Broadly, the CDC data analyzed by this module includes:
The tidyverse package is loaded and functions are sourced:
# The tidyverse functions are routinely used without package::function format
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.1.1 v dplyr 1.0.6
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Functions are available in source file
source("./Generic_Added_Utility_Functions_202105_v001.R")
source("./Coronavirus_CDC_Daily_Functions_v001.R")
A series of mapping files are also available to allow for parameterized processing. Mappings include:
These default parameters are maintained in a separate .R file and can be sourced:
source("./Coronavirus_CDC_Daily_Default_Mappings_v002.R")
Additionally, a mapping file could be maintained to give default plotting labels to variables. This is currently not used by any of the sourced functions:
# Create a variable mapping file - this is currently redundant
varMapper <- c()
Code from the previous model is run, with results compared to previous results:
readList <- list("cdcDaily"="./RInputFiles/Coronavirus/CDC_dc_downloaded_210502.csv",
"cdcHosp"="./RInputFiles/Coronavirus/CDC_h_downloaded_210509.csv"
)
cdc_daily_compare <- readRunCDCDaily(thruLabel="May 2, 2021",
readFrom=readList,
compareFile=list("cdcDaily"=colRenamer(readFromRDS("dfRaw_dc_210414"),
c('new_case'='new_cases',
'tot_death'='tot_deaths',
'new_death'='new_deaths'
)
),
"cdcHosp"=readFromRDS("dfHosp_old")
),
writeLog="./RInputFiles/Coronavirus/Coronavirus_CDC_Daily_v002.log",
ovrwriteLog=TRUE,
dfPerCapita=NULL,
useClusters=readFromRDS("cdc_daily_test_v2")$useClusters,
skipAssessmentPlots=FALSE,
brewPalette="Paired"
)
##
## No file has been downloaded, will use existing file: ./RInputFiles/Coronavirus/CDC_dc_downloaded_210502.csv
##
## -- Column specification --------------------------------------------------------
## cols(
## submission_date = col_character(),
## state = col_character(),
## tot_cases = col_double(),
## conf_cases = col_double(),
## prob_cases = col_double(),
## new_case = col_double(),
## pnew_case = col_double(),
## tot_death = col_double(),
## conf_death = col_double(),
## prob_death = col_double(),
## new_death = col_double(),
## pnew_death = col_double(),
## created_at = col_character(),
## consent_cases = col_character(),
## consent_deaths = col_character()
## )
##
## *** File has been checked for uniqueness by: state date
##
##
## Checking for similarity of: column names
## In reference but not in current: naconf
## In current but not in reference:
##
## Checking for similarity of: date
## In reference but not in current: 0
## In current but not in reference: 18
## Detailed differences available in: ./RInputFiles/Coronavirus/Coronavirus_CDC_Daily_v002.log
##
## Checking for similarity of: state
## In reference but not in current:
## In current but not in reference:
##
##
## ***Differences of at least 5 and at least 5%
##
## 97 records
## Detailed output available in log: ./RInputFiles/Coronavirus/Coronavirus_CDC_Daily_v002.log
##
##
## ***Differences of at least 0 and at least 0.1%
##
## 14 records
## Detailed output available in log: ./RInputFiles/Coronavirus/Coronavirus_CDC_Daily_v002.log
##
##
## No file has been downloaded, will use existing file: ./RInputFiles/Coronavirus/CDC_h_downloaded_210509.csv
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## state = col_character(),
## date = col_date(format = ""),
## geocoded_state = col_character()
## )
## i Use `spec()` for the full column specifications.
##
## *** File has been checked for uniqueness by: state date
##
##
## Checking for similarity of: column names
## In reference but not in current:
## In current but not in reference: previous_day_admission_adult_covid_confirmed_18-19 previous_day_admission_adult_covid_confirmed_18-19_coverage previous_day_admission_adult_covid_confirmed_20-29 previous_day_admission_adult_covid_confirmed_20-29_coverage previous_day_admission_adult_covid_confirmed_30-39 previous_day_admission_adult_covid_confirmed_30-39_coverage previous_day_admission_adult_covid_confirmed_40-49 previous_day_admission_adult_covid_confirmed_40-49_coverage previous_day_admission_adult_covid_confirmed_50-59 previous_day_admission_adult_covid_confirmed_50-59_coverage previous_day_admission_adult_covid_confirmed_60-69 previous_day_admission_adult_covid_confirmed_60-69_coverage previous_day_admission_adult_covid_confirmed_70-79 previous_day_admission_adult_covid_confirmed_70-79_coverage previous_day_admission_adult_covid_confirmed_80+ previous_day_admission_adult_covid_confirmed_80+_coverage previous_day_admission_adult_covid_confirmed_unknown previous_day_admission_adult_covid_confirmed_unknown_coverage previous_day_admission_adult_covid_suspected_18-19 previous_day_admission_adult_covid_suspected_18-19_coverage previous_day_admission_adult_covid_suspected_20-29 previous_day_admission_adult_covid_suspected_20-29_coverage previous_day_admission_adult_covid_suspected_30-39 previous_day_admission_adult_covid_suspected_30-39_coverage previous_day_admission_adult_covid_suspected_40-49 previous_day_admission_adult_covid_suspected_40-49_coverage previous_day_admission_adult_covid_suspected_50-59 previous_day_admission_adult_covid_suspected_50-59_coverage previous_day_admission_adult_covid_suspected_60-69 previous_day_admission_adult_covid_suspected_60-69_coverage previous_day_admission_adult_covid_suspected_70-79 previous_day_admission_adult_covid_suspected_70-79_coverage previous_day_admission_adult_covid_suspected_80+ previous_day_admission_adult_covid_suspected_80+_coverage previous_day_admission_adult_covid_suspected_unknown previous_day_admission_adult_covid_suspected_unknown_coverage
##
## Checking for similarity of: date
## In reference but not in current: 0
## In current but not in reference: 15
## Detailed differences available in: ./RInputFiles/Coronavirus/Coronavirus_CDC_Daily_v002.log
##
## Checking for similarity of: state
## In reference but not in current:
## In current but not in reference:
##
##
## ***Differences of at least 5 and at least 5%
##
## 6 records
## Detailed output available in log: ./RInputFiles/Coronavirus/Coronavirus_CDC_Daily_v002.log
##
##
## ***Differences of at least 0 and at least 0.1%
##
## 63 records
## Detailed output available in log: ./RInputFiles/Coronavirus/Coronavirus_CDC_Daily_v002.log
##
##
## Column sums before and after applying filtering rules:
## # A tibble: 3 x 6
## isType tot_cases tot_deaths new_cases new_deaths n
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 before 5.08e+9 1.07e+8 3.21e+7 558830 27435
## 2 after 5.06e+9 1.06e+8 3.19e+7 556355 23715
## 3 pctchg 4.40e-3 3.81e-3 4.47e-3 0.00443 0.136
##
##
## Column sums before and after applying filtering rules:
## # A tibble: 3 x 5
## isType inp hosp_adult hosp_ped n
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 before 2.57e+7 1.99e+7 436353 23230
## 2 after 2.56e+7 1.98e+7 426239 22395
## 3 pctchg 5.60e-3 5.66e-3 0.0232 0.0359
## Warning in showSRID(uprojargs, format = "PROJ", multiline = "NO", prefer_proj =
## prefer_proj): Discarded datum unknown in CRS definition
identical(cdc_daily_compare[c("stateData", "dfRaw", "dfProcess", "dfPerCapita", "useClusters")],
readFromRDS("cdc_daily_test_v3")[c("stateData", "dfRaw", "dfProcess", "dfPerCapita", "useClusters")]
)
## [1] TRUE
identical(cdc_daily_compare$plotDataList[c("dfFull", "dfAgg", "plotClusters")],
readFromRDS("cdc_daily_test_v3")$plotDataList[c("dfFull", "dfAgg", "plotClusters")]
)
## [1] TRUE
The core data elements are identical, and the plots appear to convey the same information. Next steps are to download the latest data and process with existing clusters.
Updated data are downloaded and processed, using existing segments. The downloadTo argument is edited using lapply to avoid downloading data if it has previously been downloaded:
readList <- list("cdcDaily"="./RInputFiles/Coronavirus/CDC_dc_downloaded_210528.csv",
"cdcHosp"="./RInputFiles/Coronavirus/CDC_h_downloaded_210528.csv"
)
compareList <- list("cdcDaily"=readFromRDS("cdc_daily_test_v3")$dfRaw$cdcDaily,
"cdcHosp"=readFromRDS("cdc_daily_test_v3")$dfRaw$cdcHosp
)
cdc_daily_210528 <- readRunCDCDaily(thruLabel="May 28, 2021",
downloadTo=lapply(readList, FUN=function(x) if(file.exists(x)) NA else x),
readFrom=readList,
compareFile=compareList,
writeLog="./RInputFiles/Coronavirus/Coronavirus_CDC_Daily_210528.log",
useClusters=readFromRDS("cdc_daily_test_v2")$useClusters,
skipAssessmentPlots=FALSE,
brewPalette="Paired"
)
##
## -- Column specification --------------------------------------------------------
## cols(
## submission_date = col_character(),
## state = col_character(),
## tot_cases = col_double(),
## conf_cases = col_double(),
## prob_cases = col_double(),
## new_case = col_double(),
## pnew_case = col_double(),
## tot_death = col_double(),
## conf_death = col_double(),
## prob_death = col_double(),
## new_death = col_double(),
## pnew_death = col_double(),
## created_at = col_character(),
## consent_cases = col_character(),
## consent_deaths = col_character()
## )
##
## *** File has been checked for uniqueness by: state date
##
##
## Checking for similarity of: column names
## In reference but not in current:
## In current but not in reference:
##
## Checking for similarity of: date
## In reference but not in current: 0
## In current but not in reference: 26
## Detailed differences available in: ./RInputFiles/Coronavirus/Coronavirus_CDC_Daily_210528.log
##
## Checking for similarity of: state
## In reference but not in current:
## In current but not in reference:
##
##
## ***Differences of at least 5 and at least 5%
##
## 593 records
## Detailed output available in log: ./RInputFiles/Coronavirus/Coronavirus_CDC_Daily_210528.log
##
##
## ***Differences of at least 0 and at least 0.1%
##
## 39 records
## Detailed output available in log: ./RInputFiles/Coronavirus/Coronavirus_CDC_Daily_210528.log
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## state = col_character(),
## date = col_date(format = ""),
## geocoded_state = col_character()
## )
## i Use `spec()` for the full column specifications.
##
## *** File has been checked for uniqueness by: state date
##
##
## Checking for similarity of: column names
## In reference but not in current:
## In current but not in reference:
##
## Checking for similarity of: date
## In reference but not in current: 0
## In current but not in reference: 14
## Detailed differences available in: ./RInputFiles/Coronavirus/Coronavirus_CDC_Daily_210528.log
##
## Checking for similarity of: state
## In reference but not in current:
## In current but not in reference:
##
##
## ***Differences of at least 5 and at least 5%
##
## 3 records
## Detailed output available in log: ./RInputFiles/Coronavirus/Coronavirus_CDC_Daily_210528.log
##
##
## ***Differences of at least 0 and at least 0.1%
##
## 49 records
## Detailed output available in log: ./RInputFiles/Coronavirus/Coronavirus_CDC_Daily_210528.log
##
##
## Column sums before and after applying filtering rules:
## # A tibble: 3 x 6
## isType tot_cases tot_deaths new_cases new_deaths n
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 before 5.99e+9 1.24e+8 3.29e+7 577667 28969
## 2 after 5.96e+9 1.23e+8 3.28e+7 575010 25041
## 3 pctchg 4.37e-3 3.82e-3 4.55e-3 0.00460 0.136
##
##
## Column sums before and after applying filtering rules:
## # A tibble: 3 x 5
## isType inp hosp_adult hosp_ped n
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 before 2.61e+7 2.03e+7 415621 23972
## 2 after 2.60e+7 2.02e+7 405188 23109
## 3 pctchg 5.67e-3 5.73e-3 0.0251 0.0360
## Warning in showSRID(uprojargs, format = "PROJ", multiline = "NO", prefer_proj =
## prefer_proj): Discarded datum unknown in CRS definition
saveToRDS(cdc_daily_210528, ovrWrite=FALSE, ovrWriteError=FALSE)
The process appears to work as intended. Next steps are to update the county-level data process, making use of some of the functions available for CDC data processing.
The latest version of the data are downloaded and processed:
readList <- list("cdcDaily"="./RInputFiles/Coronavirus/CDC_dc_downloaded_210708.csv",
"cdcHosp"="./RInputFiles/Coronavirus/CDC_h_downloaded_210708.csv"
)
compareList <- list("cdcDaily"=readFromRDS("cdc_daily_210528")$dfRaw$cdcDaily,
"cdcHosp"=readFromRDS("cdc_daily_210528")$dfRaw$cdcHosp
)
cdc_daily_210708 <- readRunCDCDaily(thruLabel="Jul 08, 2021",
downloadTo=lapply(readList, FUN=function(x) if(file.exists(x)) NA else x),
readFrom=readList,
compareFile=compareList,
writeLog="./RInputFiles/Coronavirus/Coronavirus_CDC_Daily_210708.log",
useClusters=readFromRDS("cdc_daily_210528")$useClusters,
skipAssessmentPlots=FALSE,
brewPalette="Paired"
)
##
## -- Column specification --------------------------------------------------------
## cols(
## submission_date = col_character(),
## state = col_character(),
## tot_cases = col_double(),
## conf_cases = col_double(),
## prob_cases = col_double(),
## new_case = col_double(),
## pnew_case = col_double(),
## tot_death = col_double(),
## conf_death = col_double(),
## prob_death = col_double(),
## new_death = col_double(),
## pnew_death = col_double(),
## created_at = col_character(),
## consent_cases = col_character(),
## consent_deaths = col_character()
## )
##
## *** File has been checked for uniqueness by: state date
##
##
## Checking for similarity of: column names
## In reference but not in current:
## In current but not in reference:
##
## Checking for similarity of: date
## In reference but not in current: 0
## In current but not in reference: 40
## Detailed differences available in: ./RInputFiles/Coronavirus/Coronavirus_CDC_Daily_210708.log
##
## Checking for similarity of: state
## In reference but not in current:
## In current but not in reference:
##
##
## ***Differences of at least 5 and at least 5%
##
## 432 records
## Detailed output available in log: ./RInputFiles/Coronavirus/Coronavirus_CDC_Daily_210708.log
##
##
## ***Differences of at least 0 and at least 0.1%
##
## 43 records
## Detailed output available in log: ./RInputFiles/Coronavirus/Coronavirus_CDC_Daily_210708.log
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## state = col_character(),
## date = col_date(format = ""),
## geocoded_state = col_logical()
## )
## i Use `spec()` for the full column specifications.
##
## *** File has been checked for uniqueness by: state date
##
##
## Checking for similarity of: column names
## In reference but not in current:
## In current but not in reference: deaths_covid deaths_covid_coverage
##
## Checking for similarity of: date
## In reference but not in current: 0
## In current but not in reference: 42
## Detailed differences available in: ./RInputFiles/Coronavirus/Coronavirus_CDC_Daily_210708.log
##
## Checking for similarity of: state
## In reference but not in current:
## In current but not in reference:
##
##
## ***Differences of at least 5 and at least 5%
##
## 3 records
## Detailed output available in log: ./RInputFiles/Coronavirus/Coronavirus_CDC_Daily_210708.log
##
##
## ***Differences of at least 0 and at least 0.1%
##
## 57 records
## Detailed output available in log: ./RInputFiles/Coronavirus/Coronavirus_CDC_Daily_210708.log
##
##
## Column sums before and after applying filtering rules:
## # A tibble: 3 x 6
## isType tot_cases tot_deaths new_cases new_deaths n
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 before 7.32e+9 1.49e+8 3.35e+7 596979 31329
## 2 after 7.29e+9 1.48e+8 3.33e+7 594255 27081
## 3 pctchg 4.40e-3 3.91e-3 4.57e-3 0.00456 0.136
##
##
## Column sums before and after applying filtering rules:
## # A tibble: 3 x 5
## isType inp hosp_adult hosp_ped n
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 before 2.70e+7 2.11e+7 447142 26198
## 2 after 2.69e+7 2.10e+7 435737 25251
## 3 pctchg 5.65e-3 5.67e-3 0.0255 0.0361
## Warning in showSRID(uprojargs, format = "PROJ", multiline = "NO", prefer_proj =
## prefer_proj): Discarded datum unknown in CRS definition
saveToRDS(cdc_daily_210708, ovrWrite=FALSE, ovrWriteError=FALSE)
Vaccines data are also available for download on the CDC website:
urlVaccine <- "https://data.cdc.gov/api/views/unsk-b7fc/rows.csv?accessType=DOWNLOAD"
locVaccine <- "./RInputFiles/Coronavirus/CDC_vax_downloaded_210712.csv"
fileDownload(locVaccine, urlVaccine)
## size isdir mode
## ./RInputFiles/Coronavirus/CDC_vax_downloaded_210712.csv 4270315 FALSE 666
## mtime
## ./RInputFiles/Coronavirus/CDC_vax_downloaded_210712.csv 2021-07-12 09:01:36
## ctime
## ./RInputFiles/Coronavirus/CDC_vax_downloaded_210712.csv 2021-07-12 09:01:11
## atime exe
## ./RInputFiles/Coronavirus/CDC_vax_downloaded_210712.csv 2021-07-12 09:01:36 no
The file has many fields, including:
An individual can live in one state but be vaccinated in another state. Per the CDC field descriptions:
Fully vaccinated (series complete) metrics is defined as “Total number of people who are fully vaccinated (have second dose of a two-dose vaccine or one dose of a single-dose vaccine) based on the jurisdiction where recipient lives”
vaxRaw_210712 <- fileRead(locVaccine)
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## Date = col_character(),
## Location = col_character()
## )
## i Use `spec()` for the full column specifications.
glimpse(vaxRaw_210712)
## Rows: 13,618
## Columns: 69
## $ Date <chr> "07/11/2021", "07/11/2021", "07~
## $ MMWR_week <dbl> 28, 28, 28, 28, 28, 28, 28, 28,~
## $ Location <chr> "FL", "IA", "WI", "MO", "ND", "~
## $ Distributed <dbl> 25229075, 3506895, 6207245, 620~
## $ Distributed_Janssen <dbl> 1694500, 188700, 318700, 311400~
## $ Distributed_Moderna <dbl> 10217260, 1460040, 2633920, 254~
## $ Distributed_Pfizer <dbl> 13317315, 1858155, 3254625, 334~
## $ Distributed_Unk_Manuf <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
## $ Dist_Per_100K <dbl> 117466, 111151, 106609, 101065,~
## $ Distributed_Per_100k_12Plus <dbl> 134944, 131036, 124150, 118471,~
## $ Distributed_Per_100k_18Plus <dbl> 146274, 144422, 136248, 130124,~
## $ Distributed_Per_100k_65Plus <dbl> 560978, 634211, 610203, 584049,~
## $ Administered <dbl> 21527263, 3073527, 6017859, 520~
## $ Administered_12Plus <dbl> 21519017, 3073495, 6017495, 520~
## $ Administered_18Plus <dbl> 20764735, 2932330, 5732822, 499~
## $ Administered_65Plus <dbl> 7906498, 959982, 1837143, 16454~
## $ Administered_Janssen <dbl> 1048774, 128869, 240681, 174610~
## $ Administered_Moderna <dbl> 8579143, 1297742, 2470502, 1977~
## $ Administered_Pfizer <dbl> 11846137, 1646788, 3306004, 305~
## $ Administered_Unk_Manuf <dbl> 53209, 128, 672, 466, 0, 2580, ~
## $ Administered_Fed_LTC <dbl> 405647, 138684, 182382, 158723,~
## $ Administered_Fed_LTC_Residents <dbl> 209000, 62049, 85961, 85652, 30~
## $ Administered_Fed_LTC_Staff <dbl> 119292, 45853, 59621, 49923, 22~
## $ Administered_Fed_LTC_Unk <dbl> 77355, 30782, 36800, 23148, 137~
## $ Administered_Fed_LTC_Dose1 <dbl> 230126, 87469, 115893, 93047, 3~
## $ Administered_Fed_LTC_Dose1_Residents <dbl> 117587, 35533, 50724, 48321, 15~
## $ Administered_Fed_LTC_Dose1_Staff <dbl> 67708, 28547, 36168, 29112, 117~
## $ Administered_Fed_LTC_Dose1_Unk <dbl> 44831, 23389, 29001, 15614, 811~
## $ Admin_Per_100K <dbl> 100231, 97415, 103356, 84885, 8~
## $ Admin_Per_100k_12Plus <dbl> 115099, 114842, 120355, 99498, ~
## $ Admin_Per_100k_18Plus <dbl> 120391, 120760, 125835, 104872,~
## $ Admin_Per_100k_65Plus <dbl> 175804, 173610, 180600, 154933,~
## $ Recip_Administered <dbl> 21237913, 3069562, 5974955, 511~
## $ Administered_Dose1_Recip <dbl> 11763654, 1638173, 3163125, 281~
## $ Administered_Dose1_Pop_Pct <dbl> 54.8, 51.9, 54.3, 45.9, 44.4, 5~
## $ Administered_Dose1_Recip_12Plus <dbl> 11756137, 1638108, 3162679, 281~
## $ Administered_Dose1_Recip_12PlusPop_Pct <dbl> 62.9, 61.2, 63.3, 53.8, 53.0, 6~
## $ Administered_Dose1_Recip_18Plus <dbl> 11323495, 1562036, 3007052, 269~
## $ Administered_Dose1_Recip_18PlusPop_Pct <dbl> 65.7, 64.3, 66.0, 56.6, 56.0, 7~
## $ Administered_Dose1_Recip_65Plus <dbl> 4061097, 490657, 920145, 859645~
## $ Administered_Dose1_Recip_65PlusPop_Pct <dbl> 90.3, 88.7, 90.5, 80.9, 83.5, 8~
## $ Series_Complete_Yes <dbl> 10086805, 1537214, 2951037, 243~
## $ Series_Complete_Pop_Pct <dbl> 47.0, 48.7, 50.7, 39.7, 39.4, 5~
## $ Series_Complete_12Plus <dbl> 10085351, 1537191, 2950892, 243~
## $ Series_Complete_12PlusPop_Pct <dbl> 53.9, 57.4, 59.0, 46.6, 47.0, 6~
## $ Series_Complete_18Plus <dbl> 9776152, 1473385, 2825253, 2353~
## $ Series_Complete_18PlusPop_Pct <dbl> 56.7, 60.7, 62.0, 49.4, 50.0, 6~
## $ Series_Complete_65Plus <dbl> 3551211, 475114, 889344, 779851~
## $ Series_Complete_65PlusPop_Pct <dbl> 79.0, 85.9, 87.4, 73.4, 74.5, 8~
## $ Series_Complete_Janssen <dbl> 1031811, 126334, 232849, 175144~
## $ Series_Complete_Moderna <dbl> 3807918, 629990, 1161367, 90416~
## $ Series_Complete_Pfizer <dbl> 5229909, 780797, 1556520, 13597~
## $ Series_Complete_Unk_Manuf <dbl> 17167, 93, 301, 101, 1, 792, 75~
## $ Series_Complete_Janssen_12Plus <dbl> 1031093, 126332, 232832, 175129~
## $ Series_Complete_Moderna_12Plus <dbl> 3807322, 629983, 1161353, 90415~
## $ Series_Complete_Pfizer_12Plus <dbl> 5229769, 780783, 1556406, 13597~
## $ Series_Complete_Unk_Manuf_12Plus <dbl> 17167, 93, 301, 101, 1, 792, 74~
## $ Series_Complete_Janssen_18Plus <dbl> 1030595, 126273, 232707, 174990~
## $ Series_Complete_Moderna_18Plus <dbl> 3806853, 629858, 1161109, 90392~
## $ Series_Complete_Pfizer_18Plus <dbl> 4921576, 717161, 1431144, 12746~
## $ Series_Complete_Unk_Manuf_18Plus <dbl> 17128, 93, 293, 93, 1, 781, 742~
## $ Series_Complete_Janssen_65Plus <dbl> 179075, 11728, 24812, 33179, 35~
## $ Series_Complete_Moderna_65Plus <dbl> 1755611, 252070, 432381, 357176~
## $ Series_Complete_Pfizer_65Plus <dbl> 1604988, 211256, 432022, 389451~
## $ Series_Complete_Unk_Manuf_65Plus <dbl> 11537, 60, 129, 45, 0, 464, 326~
## $ Series_Complete_FedLTC <dbl> 174063, 50507, 65859, 65388, 30~
## $ Series_Complete_FedLTC_Residents <dbl> 89676, 26063, 34733, 36971, 141~
## $ Series_Complete_FedLTC_Staff <dbl> 50661, 16950, 23251, 20660, 105~
## $ Series_Complete_FedLTC_Unknown <dbl> 33726, 7494, 7875, 7757, 552, 9~
vaxRenamer <- c("Location"="state",
"Date"="date",
"Admin_Per_100K"="Admin_Per_100k"
)
vaxKeeper <- c("state", "date", "MMWR_week",
"Administered", "Administered_12Plus", "Administered_18Plus", "Administered_65Plus",
"Admin_Per_100k", "Admin_Per_100k_12Plus", "Admin_Per_100k_18Plus", "Admin_Per_100k_65Plus",
"Recip_Administered",
"Series_Complete_Yes",
"Series_Complete_12Plus", "Series_Complete_18Plus", "Series_Complete_65Plus",
"Series_Complete_Pop_Pct",
"Series_Complete_12PlusPop_Pct", "Series_Complete_18PlusPop_Pct", "Series_Complete_65PlusPop_Pct"
)
vaxProcessed_210712 <- vaxRaw_210712 %>%
colRenamer(vecRename=vaxRenamer) %>%
colSelector(vecSelect=vaxKeeper) %>%
colMutater(selfList=list("date"=lubridate::mdy))
glimpse(vaxProcessed_210712)
## Rows: 13,618
## Columns: 20
## $ state <chr> "FL", "IA", "WI", "MO", "ND", "VA", "US"~
## $ date <date> 2021-07-11, 2021-07-11, 2021-07-11, 202~
## $ MMWR_week <dbl> 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, ~
## $ Administered <dbl> 21527263, 3073527, 6017859, 5209747, 649~
## $ Administered_12Plus <dbl> 21519017, 3073495, 6017495, 5209418, 648~
## $ Administered_18Plus <dbl> 20764735, 2932330, 5732822, 4999072, 626~
## $ Administered_65Plus <dbl> 7906498, 959982, 1837143, 1645444, 19396~
## $ Admin_Per_100k <dbl> 100231, 97415, 103356, 84885, 85282, 110~
## $ Admin_Per_100k_12Plus <dbl> 115099, 114842, 120355, 99498, 101825, 1~
## $ Admin_Per_100k_18Plus <dbl> 120391, 120760, 125835, 104872, 107725, ~
## $ Admin_Per_100k_65Plus <dbl> 175804, 173610, 180600, 154933, 161847, ~
## $ Recip_Administered <dbl> 21237913, 3069562, 5974955, 5114570, 618~
## $ Series_Complete_Yes <dbl> 10086805, 1537214, 2951037, 2439175, 300~
## $ Series_Complete_12Plus <dbl> 10085351, 1537191, 2950892, 2439129, 299~
## $ Series_Complete_18Plus <dbl> 9776152, 1473385, 2825253, 2353696, 2909~
## $ Series_Complete_65Plus <dbl> 3551211, 475114, 889344, 779851, 89281, ~
## $ Series_Complete_Pop_Pct <dbl> 47.0, 48.7, 50.7, 39.7, 39.4, 52.9, 48.0~
## $ Series_Complete_12PlusPop_Pct <dbl> 53.9, 57.4, 59.0, 46.6, 47.0, 61.6, 56.1~
## $ Series_Complete_18PlusPop_Pct <dbl> 56.7, 60.7, 62.0, 49.4, 50.0, 63.7, 58.8~
## $ Series_Complete_65PlusPop_Pct <dbl> 79.0, 85.9, 87.4, 73.4, 74.5, 81.4, 79.0~
Counts by state are created:
vaxState <- vaxProcessed_210712 %>%
group_by(state) %>%
filter(date==max(date)) %>%
select(state, date, Administered, Recip_Administered, Series_Complete_Yes) %>%
ungroup() %>%
arrange(-Administered)
vaxState
## # A tibble: 65 x 5
## state date Administered Recip_Administered Series_Complete_Yes
## <chr> <date> <dbl> <dbl> <dbl>
## 1 US 2021-07-11 334151648 334151648 159266536
## 2 CA 2021-07-11 43609176 43607956 20176353
## 3 TX 2021-07-11 26245668 25536886 12230164
## 4 NY 2021-07-11 22233988 22166452 10763740
## 5 FL 2021-07-11 21527263 21237913 10086805
## 6 PA 2021-07-11 14126934 14159474 6486641
## 7 IL 2021-07-11 13206252 13344907 5971607
## 8 OH 2021-07-11 10835735 10710147 5318622
## 9 NJ 2021-07-11 10029522 10332551 5006341
## 10 MI 2021-07-11 9562802 9766213 4780127
## # ... with 55 more rows
vaxState %>%
filter(!(state %in% c(state.abb, "DC")))
## # A tibble: 14 x 5
## state date Administered Recip_Administered Series_Complete_Yes
## <chr> <date> <dbl> <dbl> <dbl>
## 1 US 2021-07-11 334151648 334151648 159266536
## 2 LTC 2021-07-11 7899665 0 0
## 3 VA2 2021-07-11 5381413 5381413 2706838
## 4 DD2 2021-07-11 4382578 4382578 1888769
## 5 PR 2021-07-11 3832854 3860036 1839207
## 6 IH2 2021-07-11 1459669 1459669 668566
## 7 BP2 2021-07-11 197049 197049 97863
## 8 GU 2021-07-11 194248 194467 93628
## 9 VI 2021-07-11 79692 77067 35899
## 10 MP 2021-07-11 57308 57358 27509
## 11 FM 2021-07-11 51997 52375 26444
## 12 AS 2021-07-11 48178 48436 21997
## 13 MH 2021-07-11 34127 34184 16365
## 14 RP 2021-07-11 25416 25637 13284
vaxState %>%
filter(!(state == "US")) %>%
mutate(pctComplete=Series_Complete_Yes/sum(Series_Complete_Yes)) %>%
mutate(is50DC=state %in% c(state.abb, "DC")) %>%
group_by(is50DC) %>%
summarize(n=n(), across(where(is.numeric), sum), .groups="drop")
## # A tibble: 2 x 6
## is50DC n Administered Recip_Administered Series_Complete_Yes pctComplete
## <lgl> <int> <dbl> <dbl> <dbl> <dbl>
## 1 FALSE 13 23644194 15770269 7436369 0.0455
## 2 TRUE 51 328226539 327261454 156057188 0.955
vaxProcessed_210712 %>%
filter(state=="US") %>%
select(state, date, Administered, Recip_Administered, Series_Complete_Yes) %>%
pivot_longer(-c(state, date)) %>%
ggplot(aes(x=date, y=value/1000000)) +
geom_line(aes(group=name, color=name)) +
labs(x="", y="Number of Doses/People (millions)", title="All-US Vaccination totals")
Roughly 5% of completely vaccinated individuals are tracked to entities that do not map back to states. These will be deleted for further analysis, which may lead to some disconnects.
Next steps are to continue processing the data and to integrate with the other state-level metrics.
Implied populations and vaccinations by subgroup are calculated:
vaxImplied_210712 <- vaxProcessed_210712 %>%
mutate(popTot=100*Series_Complete_Yes/Series_Complete_Pop_Pct,
pop65Plus=100*Series_Complete_65Plus/Series_Complete_65PlusPop_Pct,
pop18Plus=100*Series_Complete_18Plus/Series_Complete_18PlusPop_Pct,
pop12Plus=100*Series_Complete_12Plus/Series_Complete_12PlusPop_Pct,
pop1864=pop18Plus-pop65Plus,
pop1217=pop12Plus-pop18Plus,
pop0011=popTot-pop12Plus,
vax65Plus=Series_Complete_65Plus,
vax1864=Series_Complete_18Plus-Series_Complete_65Plus,
vax1217=Series_Complete_12Plus-Series_Complete_18Plus,
vax0011=Series_Complete_Yes-Series_Complete_12Plus
)
popData <- vaxImplied_210712 %>%
filter(state %in% c(state.abb, "DC", "PR", "US")) %>%
group_by(state) %>%
summarize(across(.cols=c(pop65Plus, pop1864, pop1217, pop0011),
.fns=list(mu=~mean(.x, na.rm=TRUE),
sdmu=~sd(.x, na.rm=TRUE)/mean(.x, na.rm=TRUE),
rangemu=~diff(range(.x, na.rm=TRUE)/mean(.x, na.rm=TRUE))
)
),
.groups="drop"
)
popData %>%
select(state, contains("_rangemu")) %>%
pivot_longer(-state) %>%
ggplot(aes(x=fct_reorder(state, value, .fun=max), y=value)) +
geom_point() +
coord_flip() +
facet_wrap(~name, nrow=1) +
labs(y="Range divided by mean", x=NULL, title="Consistency of population estimates by subgroup and state")
## Warning: Removed 2 rows containing missing values (geom_point).
popData %>%
select(state, contains("_mu")) %>%
pivot_longer(-state) %>%
group_by(state) %>%
mutate(pct65Plus=sum(ifelse(name=="pop65Plus_mu", value, 0))/sum(value)) %>%
ungroup() %>%
ggplot(aes(x=fct_reorder(state, pct65Plus), y=value)) +
geom_col(aes(fill=name), position="fill") +
coord_flip() +
scale_fill_discrete("") +
labs(y="Proportion of population", x=NULL, title="Population breakout by state")
## Warning: Removed 4 rows containing missing values (geom_col).
vaxImplied_210712 %>%
filter(state=="US") %>%
select(state, date, starts_with("vax")) %>%
pivot_longer(-c(state, date)) %>%
ggplot(aes(x=date, y=value)) +
geom_line(aes(group=name, color=name))
vaxImplied_210712 %>%
filter(state=="US") %>%
select(state, date, starts_with("vax")) %>%
pivot_longer(-c(state, date)) %>%
mutate(eq0=(value==0), lt0=(value<0)) %>%
filter(value<=0) %>%
group_by(eq0, lt0, name) %>%
summarize(across(date, .fns=list(min=min, max=max)), .groups="drop")
## # A tibble: 5 x 5
## eq0 lt0 name date_min date_max
## <lgl> <lgl> <chr> <date> <date>
## 1 FALSE TRUE vax1217 2021-03-05 2021-05-12
## 2 TRUE FALSE vax0011 2020-12-13 2021-03-04
## 3 TRUE FALSE vax1217 2020-12-13 2021-03-04
## 4 TRUE FALSE vax1864 2020-12-13 2021-03-04
## 5 TRUE FALSE vax65Plus 2020-12-13 2021-03-04
Population estimates are generally consistent by state across dates, with the greatest variability in the 12-17 age estimates (expected since it is the smallest group where rounded percent vaccinated would have the most impact).
Distributions by age and state appear reasonable.
There has clearly been a change in tracking where fully vaccinated are tracked using age buckets:
Next steps are to modify code so that subtotal statistics by age bucket are used only when where appropriate.
The availability of fields for state ‘US’ (full nation) is explored:
vaxProcessed_210712 %>%
filter(state=="US") %>%
pivot_longer(-c(state, date)) %>%
mutate(valType=case_when(value < 0 ~ "red", value==0 ~ "orange", value > 0 ~ "green")) %>%
ggplot(aes(x=date, y=fct_reorder(name, valType=="green", .fun=sum), fill=valType)) +
geom_tile() +
scale_fill_identity() +
labs(x=NULL, y=NULL, title="Data availability by metric", subtitle="Red is negative, orange is zero")
In the early months, data are available only for administration. The “series complete” metrics are introduced later, with the 12Plus bucket added even later as authorizations for use in ages 12-17 were added.
A comparison of states/DC to US is made for each of the key metrics:
vaxProcessed_210712 %>%
mutate(stateType=case_when(state=="US" ~ "US", state %in% c(state.abb, "DC") ~ "state/DC", TRUE ~ "other")) %>%
group_by(stateType, date, MMWR_week) %>%
summarize(across(where(is.numeric), .fns=sum), .groups="drop") %>%
pivot_longer(-c(stateType, date, MMWR_week)) %>%
filter(!(str_detect(name, "Per|Pct"))) %>%
ggplot(aes(x=date, y=value)) +
geom_line(aes(group=stateType, color=stateType)) +
facet_wrap(~name, scales="free_y")
In general, the sum of the states and DC are close to the total for US. Per capita and percentage metrics cannot be summed and were not compared directly.
Next steps are to adapt the population splits to account for the variable timing of initial data availability A heuristic can likely be used for the split of 65Plus in the early days, with 12Plus and 18Plus assumed to be equal (no usage in 0-17 group) prior to age being broken out.
An assumption is made that Series_Complete_Yes maps to the oldest group still left to populate when data breakouts are incomplete:
vaxImplied_210712_v2 <- vaxProcessed_210712 %>%
mutate(popTot=100*Series_Complete_Yes/Series_Complete_Pop_Pct,
pop65Plus=100*Series_Complete_65Plus/Series_Complete_65PlusPop_Pct,
pop18Plus=100*Series_Complete_18Plus/Series_Complete_18PlusPop_Pct,
pop12Plus=100*Series_Complete_12Plus/Series_Complete_12PlusPop_Pct,
pop1864=pop18Plus-pop65Plus,
pop1217=pop12Plus-pop18Plus,
pop0011=popTot-pop12Plus,
vax65Plus=Series_Complete_65Plus,
vax1864=Series_Complete_18Plus-Series_Complete_65Plus,
vax1217=ifelse(Series_Complete_12Plus>0, Series_Complete_12Plus, Series_Complete_Yes)-Series_Complete_18Plus,
vax0011=Series_Complete_Yes-vax65Plus-vax1864-vax1217
)
popData_v2 <- vaxImplied_210712_v2 %>%
filter(state %in% c(state.abb, "DC", "US")) %>%
group_by(state) %>%
summarize(across(.cols=c(popTot, pop65Plus, pop1864, pop1217, pop0011),
.fns=list(mu=~mean(.x, na.rm=TRUE),
sdmu=~sd(.x, na.rm=TRUE)/mean(.x, na.rm=TRUE),
rangemu=~diff(range(.x, na.rm=TRUE)/mean(.x, na.rm=TRUE))
)
),
.groups="drop"
)
popData_v2 %>%
select(state, contains("_rangemu")) %>%
pivot_longer(-state) %>%
ggplot(aes(x=fct_reorder(state, value, .fun=max), y=value)) +
geom_point() +
coord_flip() +
facet_wrap(~name, nrow=1) +
labs(y="Range divided by mean",
x=NULL,
title="Consistency of population estimates by subgroup and state"
)
popData_v2 %>%
select(state, contains("_mu"), -contains("popTot")) %>%
pivot_longer(-state) %>%
group_by(state) %>%
mutate(pct65Plus=sum(ifelse(name=="pop65Plus_mu", value, 0))/sum(value)) %>%
ungroup() %>%
ggplot(aes(x=fct_reorder(state, pct65Plus), y=value)) +
geom_col(aes(fill=name), position="fill") +
coord_flip() +
scale_fill_discrete("") +
labs(y="Proportion of population", x=NULL, title="Population breakout by state")
vaxImplied_210712_v2 %>%
filter(state=="US") %>%
select(state, date, starts_with("vax")) %>%
pivot_longer(-c(state, date)) %>%
ggplot(aes(x=date, y=value)) +
geom_line(aes(group=name, color=name))
vaxImplied_210712_v2 %>%
filter(state=="US") %>%
select(state, date, starts_with("vax")) %>%
pivot_longer(-c(state, date)) %>%
mutate(eq0=(value==0), lt0=(value<0)) %>%
filter(value<=0) %>%
group_by(eq0, lt0, name) %>%
summarize(across(date, .fns=list(min=min, max=max)), .groups="drop")
## # A tibble: 4 x 5
## eq0 lt0 name date_min date_max
## <lgl> <lgl> <chr> <date> <date>
## 1 TRUE FALSE vax0011 2020-12-13 2021-05-12
## 2 TRUE FALSE vax1217 2020-12-13 2021-03-04
## 3 TRUE FALSE vax1864 2020-12-13 2021-03-04
## 4 TRUE FALSE vax65Plus 2020-12-13 2021-03-04
Data appear reasonable for further use, though with some anomalies still related to the breakouts by age. Metrics per million on a rolling-7 basis are created:
popDataUse <- popData_v2 %>%
filter(state %in% c(state.abb, "DC")) %>%
select(state, contains("_mu")) %>%
pivot_longer(-state) %>%
mutate(ageGroup=stringr::str_replace_all(name, "pop|_mu", "")) %>%
rename(pop=value) %>%
select(state, ageGroup, pop)
vaxDataUse <- vaxImplied_210712_v2 %>%
filter(state %in% c(state.abb, "DC")) %>%
select(state, date, vaxTot=Series_Complete_Yes, starts_with("vax")) %>%
pivot_longer(-c(state, date)) %>%
mutate(ageGroup=stringr::str_replace_all(name, "vax", "")) %>%
rename(vax=value) %>%
select(state, date, ageGroup, vax)
popVaxData <- vaxDataUse %>%
inner_join(popDataUse, by=c("state", "ageGroup")) %>%
mutate(vaxpct=vax/pop) %>%
arrange(state, ageGroup, date) %>%
group_by(state, ageGroup) %>%
helperRollingAgg(origVar="vaxpct", newName="vaxpct7") %>%
ungroup()
popVaxData %>%
filter(!is.na(vaxpct7)) %>%
ggplot(aes(x=date, y=vaxpct7)) +
geom_line(aes(group=state, color=state.region[match(state, state.abb)]), alpha=0.5) +
lims(y=c(0, 1)) +
facet_wrap(~ageGroup) +
labs(title="Percent Fully Vaccinated", x=NULL, y="Rolling 7 'Series Complete' percentage") +
scale_color_discrete("Census\nRegion")
popVaxData %>%
filter(!is.na(vaxpct7)) %>%
ggplot(aes(x=date, y=vaxpct7)) +
geom_line(aes(group=ageGroup, color=ageGroup)) +
lims(y=c(0, 1)) +
facet_wrap(~state) +
labs(title="Percent Fully Vaccinated", x=NULL, y="Rolling 7 'Series Complete' percentage") +
scale_color_discrete("Age")
Next steps are to incorporate these steps as a reproducible function.
The function readQCRawCDCDaily() is copied and applied:
# Function to read and check a raw data file
readQCRawCDCDaily <- function(fileName,
writeLog=NULL,
ovrwriteLog=TRUE,
dfRef=NULL,
urlType=NULL,
url=NULL,
getData=TRUE,
ovrWriteDownload=FALSE,
vecRename=NULL,
selfList=NULL,
fullList=NULL,
uniqueBy=NULL,
step3Group=NULL,
step3Vals=NULL,
step4KeyVars=NULL,
step5PlotItems=NULL,
step6AggregateList=NULL,
inferVars=list("url"=urlMapper,
"vecRename"=renMapper,
"selfList"=selfListMapper,
"fullList"=fullListMapper,
"uniqueBy"=uqMapper,
"step3Group"=checkControlGroupMapper,
"step3Vals"=checkControlVarsMapper,
"step4KeyVars"=checkSimilarityMapper,
"step5PlotItems"=plotSimilarityMapper,
"step6AggregateList"=keyAggMapper
)
) {
# FUNCTION ARGUMENTS
# fileName: the location where downloaded data either is, or will be, stored
# writeLog: the external file location for printing (NULL means use the main log stdout)
# ovrwriteLog: boolean, if using an external log, should it be started from scratch (overwritten)?
# dfRef: a reference data frame for comparison (either NULL or NA means do not run comparisons)
# urlType: character vector that can be mapped using urlMapper and keyVarMapper
# url: direct URL passed as character string
# NOTE that if both url and urlType are NULL, no file will be downloaded
# getData: boolean, should an attempt be made to get new data using urlType or url?
# ovrWriteDownload: boolean, if fileName already exists, should it be overwritten?
# vecRename: vector for renaming c('existing name'='new name'), can be any length from 0 to ncol(df)
# NULL means infer from urlType, if not available there use c()
# selfList: list for functions to apply to self, list('variable'=fn) will apply variable=fn(variable)
# processed in order, so more than one function can be applied to self
# NULL means infer from urlType, if not available in mapping file use list()
# fullList: list for general functions to be applied, list('new variable'=expression(code))
# will create 'new variable' as eval(expression(code))
# for now, requires passing an expression
# NULL means infer from urlType, use list() if not in mapping file
# uniqueBy: combination of variables for checking uniqueness
# NULL means infer from data, keep as NULL (meaning use-all) if cannot be inferred
# step3Group: variable to be used as the x-axis (grouping) for step 3 plots
# NULL means infer from data
# step3Vals: values to be plotted on the y-axis for step 3 plots
# NULL means infer from data
# step4KeyVars: list of parameters to be passed as keyVars= in step 4
# NULL means infer from urlType
# step5PlotItems: items to be plotted in step 5
# NULL means infer from urlType
# step6AggregateList: drives the elements to be passed to compareAggregate() and flagLargeDelta()
# NULL means infer from urlType
# inferVars: vector of c('variable'='mapper') for inferring parameter values when passed as NULL
# Step 0a: Use urlType to infer key variables if passed as NULL
for (vrbl in names(inferVars)) {
mapper <- inferVars[[vrbl]]
if (is.null(get(vrbl))) {
if (urlType %in% names(mapper)) assign(vrbl, mapper[[urlType]])
else if ("default" %in% names(mapper)) assign(vrbl, mapper[["default"]])
}
}
# Step 1: Download a new file (if requested)
if (!is.null(url) & isTRUE(getData)) fileDownload(fileName=fileName, url=url, ovrWrite=ovrWriteDownload)
else cat("\nNo file has been downloaded, will use existing file:", fileName, "\n")
# Step 2: Read file, rename and mutate variables, confirm uniqueness by expected levels
dfRaw <- fileRead(fileName) %>%
colRenamer(vecRename) %>%
colMutater(selfList=selfList, fullList=fullList) %>%
checkUniqueRows(uniqueBy=uniqueBy)
# Step 3: Plot basic control totals for new cases and new deaths by month
dfRaw %>%
checkControl(groupBy=step3Group, useVars=step3Vals, printControls=FALSE, na.rm=TRUE) %>%
helperLinePlot(x=step3Group, y="newValue", facetVar="name", facetScales="free_y", groupColor="name")
# If there is no file for comparison, return the data
if (is.null(dfRef) | if(length(dfRef)==1) is.na(dfRef) else FALSE) return(dfRaw)
# Step 4b: Check similarity of existing and reference file
# ovrWriteLog=FALSE since everything should be an append after the opening text line in step 0
diffRaw <- checkSimilarity(df=dfRaw,
ref=dfRef,
keyVars=step4KeyVars,
writeLog=writeLog,
ovrwriteLog=FALSE
)
# Step 5: Plot the similarity checks
plotSimilarity(diffRaw, plotItems=step5PlotItems)
# Step 6: Plot and report on differences in aggregates
helperAggMap <- function(x) {
h1 <- compareAggregate(df=dfRaw, ref=dfRef, grpVar=x$grpVar, numVars=x$numVars,
sameUniverse=x$sameUniverse, plotData=x$plotData, isLine=x$isLine,
returnDelta=x$returnDelta)
if (isTRUE(x$flagLargeDelta)) {
h2 <- flagLargeDelta(h1, pctTol=x$pctTol, absTol=x$absTol, sortBy=x$sortBy,
dropNA=x$dropNA, printAll=x$printAll
)
if (is.null(writeLog)) print(h2)
else {
cat(nrow(h2), " records", sep="")
txt <- paste0("\n\n***Differences of at least ",
x$absTol,
" and at least ",
round(100*x$pctTol, 3), "%\n\n"
)
printLog(h2, txt=txt, writeLog=writeLog)
}
}
}
lapply(step6AggregateList, FUN=helperAggMap)
cat("\n\n")
# Return the raw data file
dfRaw
}
# Run without downloading data and without a comparison file
vaxRaw_210712_func <- readQCRawCDCDaily(fileName="./RInputFiles/Coronavirus/CDC_vax_downloaded_210712.csv",
url="https://data.cdc.gov/api/views/unsk-b7fc/rows.csv?accessType=DOWNLOAD",
getData=FALSE,
vecRename=c("Location"="state",
"Date"="date",
"Admin_Per_100K"="Admin_Per_100k"
),
selfList=list("date"=lubridate::mdy),
uniqueBy=c("date", "state"),
step3Group=c("date"),
step3Vals=c("Administered",
"Series_Complete_Yes",
"Series_Complete_12Plus",
"Series_Complete_18Plus",
"Series_Complete_65Plus"
),
inferVars=list()
)
##
## No file has been downloaded, will use existing file: ./RInputFiles/Coronavirus/CDC_vax_downloaded_210712.csv
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## Date = col_character(),
## Location = col_character()
## )
## i Use `spec()` for the full column specifications.
##
## *** File has been checked for uniqueness by: date state
While there is double-counting due to the “US” record being included, the general process for a basic file read is working as intended. Next steps are to update the process to allow for comparison to an existing file.
The latest vaccines data are downloaded, with results cached:
urlVaccine <- "https://data.cdc.gov/api/views/unsk-b7fc/rows.csv?accessType=DOWNLOAD"
locVaccine <- "./RInputFiles/Coronavirus/CDC_vax_downloaded_210717.csv"
fileDownload(locVaccine, urlVaccine)
## size isdir mode
## ./RInputFiles/Coronavirus/CDC_vax_downloaded_210717.csv 4406078 FALSE 666
## mtime
## ./RInputFiles/Coronavirus/CDC_vax_downloaded_210717.csv 2021-07-17 08:17:29
## ctime
## ./RInputFiles/Coronavirus/CDC_vax_downloaded_210717.csv 2021-07-17 08:17:26
## atime exe
## ./RInputFiles/Coronavirus/CDC_vax_downloaded_210717.csv 2021-07-17 08:17:29 no
The function readQCRawCDCDaily() is applied using the previous data as the control:
# Run without downloading data and with a comparison file
vaxRaw_210717_func <- readQCRawCDCDaily(fileName="./RInputFiles/Coronavirus/CDC_vax_downloaded_210717.csv",
dfRef=vaxRaw_210712_func,
url="https://data.cdc.gov/api/views/unsk-b7fc/rows.csv?accessType=DOWNLOAD",
getData=FALSE,
vecRename=c("Location"="state",
"Date"="date",
"Admin_Per_100K"="Admin_Per_100k"
),
selfList=list("date"=lubridate::mdy),
uniqueBy=c("date", "state"),
step3Group=c("date"),
step3Vals=c("Administered",
"Series_Complete_Yes",
"Series_Complete_12Plus",
"Series_Complete_18Plus",
"Series_Complete_65Plus"
),
step4KeyVars=list(date=list(label='date', countOnly=TRUE, convChar=TRUE),
state=list(label='state', countOnly=FALSE)
),
step5PlotItems=c("date"),
step6AggregateList=list("l1"=list("grpVar"="date",
"numVars"=c("Administered",
"Series_Complete_Yes",
"Series_Complete_12Plus",
"Series_Complete_18Plus",
"Series_Complete_65Plus"
),
"sameUniverse"=NA,
"plotData"=TRUE,
"isLine"=TRUE,
"returnDelta"=TRUE,
"flagLargeDelta"=TRUE,
"pctTol"=0.01,
"absTol"=1,
"sortBy"=c("name", "pctDelta", "absDelta"),
"dropNA"=TRUE,
"printAll"=TRUE
),
"l3"=list("grpVar"="state",
"numVars"=c("Administered",
"Series_Complete_Yes",
"Series_Complete_12Plus",
"Series_Complete_18Plus",
"Series_Complete_65Plus"
),
"sameUniverse"="date",
"plotData"=TRUE,
"isLine"=FALSE,
"returnDelta"=TRUE,
"flagLargeDelta"=TRUE,
"pctTol"=0.001,
"absTol"=0,
"sortBy"=c("name", "pctDelta", "absDelta"),
"dropNA"=TRUE,
"printAll"=TRUE
)
),
inferVars=list()
)
##
## No file has been downloaded, will use existing file: ./RInputFiles/Coronavirus/CDC_vax_downloaded_210717.csv
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## Date = col_character(),
## Location = col_character()
## )
## i Use `spec()` for the full column specifications.
##
## *** File has been checked for uniqueness by: date state
##
##
## Checking for similarity of: column names
## In reference but not in current:
## In current but not in reference:
##
## Checking for similarity of: date
## In reference but not in current: 0
## In current but not in reference: 5
##
## Checking for similarity of: state
## In reference but not in current:
## In current but not in reference:
##
##
## ***Differences of at least 1 and at least 1%
##
## [1] date name newValue refValue absDelta pctDelta
## <0 rows> (or 0-length row.names)
##
##
## ***Differences of at least 0 and at least 0.1%
##
## [1] state name newValue refValue absDelta pctDelta
## <0 rows> (or 0-length row.names)
The function works well for reading a raw vaccines data file, running basic checks, and comparing to a previous vaccines data file. Next steps are to adapt the function for processing a vaccines data file.
The function processRawFile() is leveraged:
# Generic function for processing a raw file
processRawFile <- function(df,
vecRename=c(),
vecSelect=NULL,
lstCombo=list(),
lstFilter=list(),
lstExclude=list()
) {
# FUNCTION ARGUMENTS:
# df: the raw data frame or tibble
# vecRename: vector for renaming c('existing name'='new name'), can be any length from 0 to ncol(df)
# vecSelect: vector of columns to select (run after vecRename), NULL means select all columns
# lstCombo: a nested list of combinations to be applied
# each element of the list should include comboVar, uqVars, vecCombo, and fn
# lstFilter: a list for filtering records, of form list("field"=c("allowed values"))
# lstExclude: a list for filtering records, of form list("field"=c("disallowed values"))
# STEP 1: Rename and select variables (selection occurs AFTER renaming)
dfProcess <- df %>%
colRenamer(vecRename=vecRename) %>%
colSelector(vecSelect=vecSelect)
# STEP 2: Combine multiple records to a single record
for (ctr in seq_along(lstCombo)) {
dfProcess <- dfProcess %>%
combineRows(comboVar=lstCombo[[ctr]]$comboVar,
uqVars=lstCombo[[ctr]]$uqVars,
vecCombo=lstCombo[[ctr]]$vecCombo,
fn=lstCombo[[ctr]]$fn
)
}
# STEP 3: Filter records
qcOrig <- dfProcess %>%
summarize(across(where(is.numeric), sum, na.rm=TRUE), n=n()) %>%
mutate(isType="before")
dfProcess <- dfProcess %>%
rowFilter(lstFilter=lstFilter, lstExclude=lstExclude)
# STEP 4: Report on differences
cat("\nColumn sums before and after applying filtering rules:\n")
dfProcess %>%
summarize(across(where(is.numeric), sum, na.rm=TRUE), n=n()) %>%
mutate(isType="after") %>%
bind_rows(qcOrig) %>%
arrange(desc(isType)) %>%
bind_rows(mutate(summarize(., across(where(is.numeric), function(x) (max(x)-min(x))/max(x))),
isType="pctchg"
)
) %>%
select(isType, everything()) %>%
print()
cat("\n")
# Return the processed data file
dfProcess
}
vaxProc_210717_func <- processRawFile(vaxRaw_210717_func,
vecRename=c(),
vecSelect=c("date", "state", "MMWR_week",
"Administered", "Admin_Per_100k",
"Series_Complete_Yes", "Series_Complete_Pop_Pct",
"Series_Complete_12Plus", "Series_Complete_12PlusPop_Pct",
"Series_Complete_18Plus", "Series_Complete_18PlusPop_Pct",
"Series_Complete_65Plus", "Series_Complete_65PlusPop_Pct"
),
lstCombo=list(),
lstFilter=list("state"=c(state.abb, "DC")),
lstExclude=list()
)
##
## Column sums before and after applying filtering rules:
## # A tibble: 3 x 13
## isType MMWR_week Administered Admin_Per_100k Series_Complete~ Series_Complete~
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 before 2.49e+5 7.14e+10 628437370 2.87e+10 258606.
## 2 after 1.97e+5 3.39e+10 531817808 1.39e+10 218997.
## 3 pctchg 2.10e-1 5.25e- 1 0.154 5.16e- 1 0.153
## # ... with 7 more variables: Series_Complete_12Plus <dbl>,
## # Series_Complete_12PlusPop_Pct <dbl>, Series_Complete_18Plus <dbl>,
## # Series_Complete_18PlusPop_Pct <dbl>, Series_Complete_65Plus <dbl>,
## # Series_Complete_65PlusPop_Pct <dbl>, n <dbl>
vaxProc_210717_func
## # A tibble: 10,965 x 13
## date state MMWR_week Administered Admin_Per_100k Series_Complete_Yes
## <date> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 2021-07-16 FL 28 21688774 100983 10167736
## 2 2021-07-16 KS 28 2540782 87213 1249272
## 3 2021-07-16 SC 28 4292812 83376 2045648
## 4 2021-07-16 AR 28 2310080 76548 1062254
## 5 2021-07-16 ND 28 652273 85593 301349
## 6 2021-07-16 MN 28 5979756 106031 2987234
## 7 2021-07-16 DE 28 1065570 109428 501985
## 8 2021-07-16 IA 28 3087945 97872 1543626
## 9 2021-07-16 NV 28 2867707 93103 1330894
## 10 2021-07-16 DC 28 885481 125467 379400
## # ... with 10,955 more rows, and 7 more variables:
## # Series_Complete_Pop_Pct <dbl>, Series_Complete_12Plus <dbl>,
## # Series_Complete_12PlusPop_Pct <dbl>, Series_Complete_18Plus <dbl>,
## # Series_Complete_18PlusPop_Pct <dbl>, Series_Complete_65Plus <dbl>,
## # Series_Complete_65PlusPop_Pct <dbl>
Next steps are to run the per-capita process for conversion of Administered and Series_Complete_Yes based on the same state population data used for cases, deaths, and hospitalizations.
The function createPerCapita() is leveraged:
# Function to extract and format key state data
getStateData <- function(df=readFromRDS("statePop2019"),
renameVars=c("stateAbb"="state", "NAME"="name", "pop_2019"="pop"),
keepVars=c("state", "name", "pop")
) {
# FUNCTION ARGUMENTS:
# df: the data frame containing state data
# renameVars: variables to be renamed, using named list with format "originalName"="newName"
# keepVars: variables to be kept in the final file
# Rename variables where appropriate
names(df) <- ifelse(is.na(renameVars[names(df)]), names(df), renameVars[names(df)])
# Return file with only key variables kept
df %>%
select_at(vars(all_of(keepVars)))
}
useVars <- c("state", "date", "Administered", "Series_Complete_Yes")
vaxPerCap_210717_func <- createPerCapita(select(vaxProc_210717_func, all_of(useVars)),
uqBy=c("state", "date"),
popData=getStateData(),
mapper=c("Administered"="vxapm", "Series_Complete_Yes"="vxcpm"),
)
vaxPerCap_210717_func
## # A tibble: 10,965 x 8
## state date Administered Series_Complete_Yes vxapm vxcpm vxapm7 vxcpm7
## <chr> <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 AK 2020-12-14 0 0 0 0 NA NA
## 2 AL 2020-12-14 0 0 0 0 NA NA
## 3 AR 2020-12-14 0 0 0 0 NA NA
## 4 AZ 2020-12-14 0 0 0 0 NA NA
## 5 CA 2020-12-14 0 0 0 0 NA NA
## 6 CO 2020-12-14 0 0 0 0 NA NA
## 7 CT 2020-12-14 0 0 0 0 NA NA
## 8 DC 2020-12-14 0 0 0 0 NA NA
## 9 DE 2020-12-14 0 0 0 0 NA NA
## 10 FL 2020-12-14 0 0 0 0 NA NA
## # ... with 10,955 more rows
vaxPerCap_210717_func %>%
select(state, date, vxapm7, vxcpm7) %>%
pivot_longer(-c(state, date)) %>%
filter(!is.na(value), name=="vxcpm7") %>%
mutate(region=ifelse(state=="DC", "South Atlantic", as.character(state.division)[match(state, state.abb)])) %>%
ggplot(aes(x=date, y=value/1000000)) +
geom_line(aes(group=state), alpha=0.25) +
geom_line(data=~summarize(group_by(., region, date), value=median(value), .groups="drop"),
aes(color=region)
) +
facet_wrap(~region) +
lims(y=c(0, 1)) +
labs(x=NULL,
y="Proportion Fully Vaccinated (of total population)",
title="Evolution of fully vaccinated by state and census division",
subtitle="Colored line is median in region, gray line is individual states in region"
) +
theme(legend.position="none")
The createPerCapita() function is updated to allow for keeping variables without calculating per-million or rolling-7 aggregates:
# Generic function to create per-capita metrics using an existing file and source of population data
createPerCapita <- function(lst,
uqBy,
popData,
mapper,
asIsVars=c(),
lstSortBy=uqBy,
fnJoin=dplyr::full_join,
popJoinBy="state",
popVar="pop",
k=7,
mult=1000000,
...
) {
# FUNCTION ARGUMENTS:
# lst: A list containing one or more files to be joined OR a data frame that is already joined
# uqBy: character string that the input file is unique by (will be the join keys if a list is passed)
# popData: file containing population data that can be joined to the processed lst
# mapper: mapping file of c('current name'='per capita name') for mapping variables
# asIsVars: variables to be kept, but without creating pm or pm7
# lstSortBy: the sorting that should be used for creating rolling metrics
# fnJoin: The function to be used for joining files
# popJoinBy: character string for the variable(s) to be used in joining popData to lst
# popVar: character string for the variable in popData that represents population
# k: time perior for rolling aggregations
# mult: the unit for the per-capita data (default 1 million means make metrics per million)
# ...: other arguments to be passed to combineFiles()
# Step 1: If a list has been passed, use a joining process to create a data frame
if ("list" %in% class(lst)) lst <- combineFiles(lst, byVars=uqBy, fn=fnJoin, ...)
# Step 2: Sort the data using sortBy
df <- dplyr::arrange(lst, across(all_of(lstSortBy)))
# Step 3: Check that all variables other than uqBy and asIsVars can be mapped using mapper
keyVars <- setdiff(names(df), c(uqBy, asIsVars))
if (any(isFALSE(keyVars %in% mapper))) stop("\nVariable is missing in per capita mapper file\n")
# Step 4: Run the per capita mapping process
df <- helperMakePerCapita(df,
mapVars=mapper[keyVars],
popData=popData,
k=k,
byVar=popJoinBy,
sortVar=setdiff(lstSortBy, popJoinBy),
popVar=popVar,
mult=mult
)
# Return the data frame
df
}
The updated process is then run, keeping the breakout for 65+ and 18+:
uqVars <- c("state", "date")
perCapVars <- c("Administered", "Series_Complete_Yes")
asIsVars <- c("Series_Complete_65Plus", "Series_Complete_65PlusPop_Pct",
"Series_Complete_18Plus", "Series_Complete_18PlusPop_Pct",
"Admin_Per_100k", "Series_Complete_Pop_Pct"
)
vaxPerCap_210717_func_v2 <- createPerCapita(select(vaxProc_210717_func, all_of(c(uqVars, perCapVars, asIsVars))),
uqBy=uqVars,
asIsVars=asIsVars,
popData=getStateData(),
mapper=c("Administered"="vxapm", "Series_Complete_Yes"="vxcpm")
) %>%
colRenamer(c("Series_Complete_Yes"="vxc",
"Administered"="vxa",
"Series_Complete_Pop_Pct"="vxcpoppct",
"Series_Complete_65Plus"="vxcgte65",
"Series_Complete_65PlusPop_Pct"="vxcgte65pct",
"Series_Complete_18Plus"="vxcgte18",
"Series_Complete_18PlusPop_Pct"="vxcgte18pct"
)
)
vaxPerCap_210717_func_v2
## # A tibble: 10,965 x 14
## state date vxa vxc vxcgte65 vxcgte65pct vxcgte18 vxcgte18pct
## <chr> <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 AK 2020-12-14 0 0 0 0 0 0
## 2 AL 2020-12-14 0 0 0 0 0 0
## 3 AR 2020-12-14 0 0 0 0 0 0
## 4 AZ 2020-12-14 0 0 0 0 0 0
## 5 CA 2020-12-14 0 0 0 0 0 0
## 6 CO 2020-12-14 0 0 0 0 0 0
## 7 CT 2020-12-14 0 0 0 0 0 0
## 8 DC 2020-12-14 0 0 0 0 0 0
## 9 DE 2020-12-14 0 0 0 0 0 0
## 10 FL 2020-12-14 0 0 0 0 0 0
## # ... with 10,955 more rows, and 6 more variables: Admin_Per_100k <dbl>,
## # vxcpoppct <dbl>, vxapm <dbl>, vxcpm <dbl>, vxapm7 <dbl>, vxcpm7 <dbl>
# Check consistency of 'Admin_Per_100k' and 'vxapm'
vaxPerCap_210717_func_v2 %>%
filter(date==max(date)) %>%
ggplot(aes(x=Admin_Per_100k, y=vxapm)) +
geom_text(aes(label=state)) +
geom_abline(slope=10, intercept=0, lty=2) +
labs(x="Raw data administered per 100k",
y="Function-calculated adminsitered per million",
title="Consistency of raw data and function-calculated per capita data",
subtitle="Dotted line is per-million at 10x per-100k (expected)"
)
# Check consistency of 'vxcpoppct' and 'vxcpm'
vaxPerCap_210717_func_v2 %>%
filter(date==max(date)) %>%
ggplot(aes(x=vxcpoppct, y=vxcpm)) +
geom_text(aes(label=state)) +
geom_abline(slope=10000, intercept=0, lty=2) +
labs(x="Raw data percent of population completely vaccinated",
y="Function-calculated completely vaccinated per million",
title="Consistency of raw data and function-calculated per capita data",
subtitle="Dotted line is per-million at 10,000x per-100 (expected)"
)
The raw data and per-capita totals are aligned, suggesting that population estimates used in the datasets are very similar (functions use 2019 estimates as per getStateData()).
The colMutater() function is added to include:
Variables are added as follows:
# Conversions for 18-64 and 0-17
subGroupList <- list("vxc1864"=expression(vxcgte18-vxcgte65),
"vxc0017"=expression(vxc-vxcgte18)
)
# Conversions for per-day
perDayFunc <- function(x) ifelse(row_number()==1, x, ifelse(lag(x)==0, 0, x-lag(x)))
perDayList <- list("vxa_perday"=expression(perDayFunc(vxa)),
"vxc_perday"=expression(perDayFunc(vxc)),
"vxcgte65_perday"=expression(perDayFunc(vxcgte65)),
"vxc1864_perday"=expression(perDayFunc(vxc1864)),
"vxc0017_perday"=expression(perDayFunc(vxc0017))
)
vaxPerCap_210717_func_v3 <- vaxPerCap_210717_func_v2 %>%
colMutater(fullList=subGroupList) %>%
arrange(date, state) %>%
group_by(state) %>%
colMutater(fullList=perDayList) %>%
ungroup()
# Check that files are identical for same variables
sapply(names(vaxPerCap_210717_func_v2),
FUN=function(x) all.equal(vaxPerCap_210717_func_v2[[x]], vaxPerCap_210717_func_v3[[x]])
) %>%
t() %>%
t()
## [,1]
## state TRUE
## date TRUE
## vxa TRUE
## vxc TRUE
## vxcgte65 TRUE
## vxcgte65pct TRUE
## vxcgte18 TRUE
## vxcgte18pct TRUE
## Admin_Per_100k TRUE
## vxcpoppct TRUE
## vxapm TRUE
## vxcpm TRUE
## vxapm7 TRUE
## vxcpm7 TRUE
# Plot evolution of vaccines by age
vaxPerCap_210717_func_v3 %>%
select(date, vxc, vxcgte65, vxc1864, vxc0017) %>%
group_by(date) %>%
summarize(across(.fns=sum)) %>%
pivot_longer(-date) %>%
ggplot(aes(x=date)) +
geom_point(data=~filter(., name=="vxc"), aes(y=value/1000000)) +
geom_col(data=~filter(., name!="vxc"), aes(y=value/1000000, fill=name), position="stack") +
labs(x=NULL,
y="Completely Vaccinated (millions)",
title="Evolution of fully vaccinated by age group",
subtitle="Dots are total people fully vaccinated"
) +
scale_fill_discrete("Age")
# Plot evolution of vaccines administered per day
vaxPerCap_210717_func_v3 %>%
select(date, vxa, vxa_perday) %>%
group_by(date) %>%
summarize(across(where(is.numeric), sum)) %>%
pivot_longer(-date) %>%
group_by(name) %>%
mutate(value7=zoo::rollmean(value, k=7, fill=NA)) %>%
ungroup() %>%
ggplot(aes(x=date)) +
geom_line(aes(y=value/1000000)) +
geom_line(data=~filter(., name=="vxa_perday", !is.na(value7)), aes(y=value7/1000000), color="red", lwd=2) +
facet_wrap(~c("vxa"="Cumulative", "vxa_perday"="Daily")[name], scales="free_y") +
labs(x=NULL,
y="Vaccines Adminsitered (millions)",
title="Evolution of vaccines administered",
subtitle="Red line is rolling 7-day average"
)
Variables appear to be created as intended.
Next, total population is estimated and plots of vaccines administered per capita are created:
# Plot evolution of vaccines administered per day
vaxPerCap_210717_func_v3 %>%
select(state, date, vxa, vxa_perday, Admin_Per_100k) %>%
group_by(state) %>%
mutate(pop=median(100000*vxa/Admin_Per_100k, na.rm=TRUE)) %>%
ungroup() %>%
pivot_longer(-c(state, date, pop)) %>%
group_by(state, pop, name) %>%
mutate(value7=zoo::rollmean(value, k=7, fill=NA)) %>%
ungroup() %>%
ggplot(aes(x=date)) +
geom_line(data=~filter(., name=="vxa_perday", !is.na(value7), state != "NM"),
aes(y=1000*value7/pop, group=state)
) +
facet_wrap(~state) +
labs(x=NULL,
y="Vaccines Adminsitered (per thousand)",
title="Evolution of vaccines administered (rolling 7-day average)"
)
# Plot evolution of vaccines administered (cumulative)
vaxPerCap_210717_func_v3 %>%
select(state, date, vxa, vxa_perday, Admin_Per_100k) %>%
group_by(state) %>%
mutate(pop=median(100000*vxa/Admin_Per_100k, na.rm=TRUE)) %>%
ungroup() %>%
pivot_longer(-c(state, date, pop)) %>%
group_by(state, pop, name) %>%
mutate(value7=zoo::rollmean(value, k=7, fill=NA)) %>%
ungroup() %>%
ggplot(aes(x=date)) +
geom_line(data=~filter(., name=="vxa", !is.na(value)),
aes(y=1000*value/pop, group=state)
) +
geom_hline(yintercept=1000, lty=2) +
facet_wrap(~state) +
labs(x=NULL,
y="Vaccines Adminsitered (per thousand)",
title="Evolution of vaccines administered (cumulative)"
)
Estimates are made for population 65+, 18-64, and 0-17 based on completion percentages:
popEstAgeState <- vaxPerCap_210717_func_v3 %>%
select(state, date, vxcgte65, vxcgte65pct, vxcgte18, vxcgte18pct, vxc, vxcpoppct) %>%
mutate(popgte65=100*vxcgte65/vxcgte65pct, popgte18=100*vxcgte18/vxcgte18pct, pop=100*vxc/vxcpoppct) %>%
group_by(state) %>%
summarize(across(c(popgte65, popgte18, pop), median, na.rm=TRUE), .groups="drop") %>%
mutate(pop1864=popgte18-popgte65, pop0017=pop-popgte18)
popEstAgeState
## # A tibble: 51 x 6
## state popgte65 popgte18 pop pop1864 pop0017
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 AK 91583. 551552. 731518. 459969. 179966.
## 2 AL 849784. 3815294. 4903279. 2965510. 1087985.
## 3 AR 523915. 2317203. 3016324. 1793288. 699121.
## 4 AZ 1308603. 5637959. 7278853. 4329356. 1640894.
## 5 CA 5838155. 30623548. 39513431. 24785393. 8889884.
## 6 CO 842448. 4499815. 5758852. 3657367. 1259037.
## 7 CT 630249. 2837921. 3565957. 2207671. 728037.
## 8 DC 87347. 577646. 705734. 490299. 128088.
## 9 DE 188903. 770105. 973906. 581202. 203801.
## 10 FL 4497505. 17248010. 21479412. 12750505. 4231402.
## # ... with 41 more rows
popEstAgeState %>%
summarize(across(where(is.numeric), sum))
## # A tibble: 1 x 5
## popgte65 popgte18 pop pop1864 pop0017
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 54058078. 255206800. 328239240. 201148722. 73032440.
popEstAgeState %>%
select(-popgte18, -pop) %>%
pivot_longer(-state) %>%
group_by(state) %>%
mutate(pctg65=ifelse(name=="popgte65", value, 0)/sum(value)) %>%
ggplot(aes(x=fct_reorder(state, pctg65, .fun=max), y=value)) +
geom_col(aes(fill=name), position="fill") +
coord_flip() +
labs(x=NULL,
y="Proportion",
title="Distribution of population by state",
subtitle="Estimated from reported vaccine completion percentages by sub-group"
) +
scale_fill_discrete("Age Group")
Metrics for evolution of complete vaccination by age cohort are calculated and plotted:
popState <- popEstAgeState %>%
select(state, popgte65, pop1864, pop0017) %>%
pivot_longer(-state) %>%
mutate(age=str_replace_all(name, "pop", "")) %>%
select(state, age, pop=value)
popState
## # A tibble: 153 x 3
## state age pop
## <chr> <chr> <dbl>
## 1 AK gte65 91583.
## 2 AK 1864 459969.
## 3 AK 0017 179966.
## 4 AL gte65 849784.
## 5 AL 1864 2965510.
## 6 AL 0017 1087985.
## 7 AR gte65 523915.
## 8 AR 1864 1793288.
## 9 AR 0017 699121.
## 10 AZ gte65 1308603.
## # ... with 143 more rows
vaxPerCap_210717_func_v3 %>%
select(state, date, vxcgte65_perday, vxc1864_perday, vxc0017_perday) %>%
pivot_longer(-c(state, date)) %>%
mutate(age=stringr::str_replace_all(name, "vxc|_perday", "")) %>%
select(state, date, age, perday=value) %>%
left_join(popState, by=c("state", "age")) %>%
mutate(perday_percap=perday/pop) %>%
arrange(state, date, age) %>%
group_by(state, age) %>%
helperRollingAgg("perday_percap", newName="perday_percap7") %>%
ungroup() %>%
filter(!is.na(perday_percap7), state != "DC") %>%
ggplot(aes(x=date, y=1000*perday_percap7)) +
geom_line(aes(group=state, color=state.region[match(state, state.abb)])) +
labs(x=NULL,
y="Completed per thousand (rolling 7-day)",
title="Newly fully vaccinated by day",
subtitle="Persons vaccinated before age-breakouts included NOT counted"
) +
facet_wrap(~age) +
scale_color_discrete("Census\nRegion")
There are discontinuities in the data, particularly for the age group 0-17 bucket. This is likely driven by differences in timing of age breakouts based on authorizations for vaccines by age cohort. Next steps are to investigate and correct for discontinuities, particularly those showing negative completed vaccinations.
The big spike then decline in 0017 is investigated:
deltaData_210717 <- vaxPerCap_210717_func_v3 %>%
select(state, date, vxc, vxcgte65, vxcgte18) %>%
group_by(date) %>%
summarize(across(where(is.numeric), sum)) %>%
mutate(across(where(is.numeric), .fn=function(x) ifelse(lag(x)==0, NA, x - lag(x)), .names="d_{.col}"))
deltaData_210717 %>%
select(date, starts_with("d")) %>%
pivot_longer(-date) %>%
filter(!is.na(value)) %>%
ggplot(aes(x=date, y=value)) +
geom_line(aes(group=name, color=name))
deltaData_210717 %>%
filter(d_vxc < d_vxcgte18)
## # A tibble: 1 x 7
## date vxc vxcgte65 vxcgte18 d_vxc d_vxcgte65 d_vxcgte18
## <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2021-04-06 62035890 29998523 61906243 612945 318833 2924665
deltaData_210717 %>%
filter(date %in% (as.Date("2021-04-01")+0:10))
## # A tibble: 11 x 7
## date vxc vxcgte65 vxcgte18 d_vxc d_vxcgte65 d_vxcgte18
## <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2021-04-01 55181012 28135317 55083159 1461437 688366 1454688
## 2 2021-04-02 57047089 28913986 56942235 1866077 778669 1859076
## 3 2021-04-03 58902844 29557456 58696677 1855755 643470 1754442
## 4 2021-04-04 60453008 29624425 58857402 1550164 66969 160725
## 5 2021-04-05 61422945 29679690 58981578 969937 55265 124176
## 6 2021-04-06 62035890 29998523 61906243 612945 318833 2924665
## 7 2021-04-07 63419456 30411378 63284674 1383566 412855 1378431
## 8 2021-04-08 65178735 30843713 65034300 1759279 432335 1749626
## 9 2021-04-09 67158210 31349712 67001971 1979475 505999 1967671
## 10 2021-04-10 69617607 31989344 69447387 2459397 639632 2445416
## 11 2021-04-11 71525283 32417679 71342554 1907676 428335 1895167
There is a situation on April 4-6 where age breakouts for vaccination increases do not make sense. An assumption is made for April 4-6 that d_vxcgte18 will be set to d_vxc for each state:
# Conversions for per-day
perDayFunc <- function(x) ifelse(row_number()==1, x, ifelse(lag(x)==0, 0, x-lag(x)))
perDayList <- list("vxa_perday"=expression(perDayFunc(vxa)),
"vxc_perday"=expression(perDayFunc(vxc)),
"vxcgte65_perday"=expression(perDayFunc(vxcgte65)),
"vxc1864_perday"=expression(perDayFunc(vxc1864)),
"vxc0017_perday"=expression(perDayFunc(vxc0017))
)
vaxPerCap_210717_func_v4 <- vaxPerCap_210717_func_v2 %>%
arrange(state, date) %>%
group_by(state) %>%
mutate(vxcgte18=ifelse(date %in% c(as.Date("2021-04-04")), lag(vxcgte18) + vxc - lag(vxc), vxcgte18)) %>%
mutate(vxcgte18=ifelse(date %in% c(as.Date("2021-04-05")), lag(vxcgte18) + vxc - lag(vxc), vxcgte18)) %>%
mutate(vxcgte18=ifelse(date %in% c(as.Date("2021-04-06")), lag(vxcgte18) + vxc - lag(vxc), vxcgte18)) %>%
ungroup() %>%
colMutater(fullList=subGroupList) %>%
arrange(date, state) %>%
group_by(state) %>%
colMutater(fullList=perDayList) %>%
ungroup()
all.equal(vaxPerCap_210717_func_v4, vaxPerCap_210717_func_v3)
## [1] "Component \"vxcgte18\": Mean relative difference: 0.02034966"
## [2] "Component \"vxc1864\": Mean relative difference: 0.03968432"
## [3] "Component \"vxc0017\": Mean relative difference: 6.03064"
## [4] "Component \"vxc1864_perday\": Mean relative difference: 1.011459"
## [5] "Component \"vxc0017_perday\": Mean relative difference: 40.83178"
all.equal(vaxPerCap_210717_func_v4[!(vaxPerCap_210717_func_v3$date %in% c(as.Date("2021-04-04")+0:3)), ],
vaxPerCap_210717_func_v3[!(vaxPerCap_210717_func_v3$date %in% c(as.Date("2021-04-04")+0:3)), ]
)
## [1] TRUE
deltaData_210717_v4 <- vaxPerCap_210717_func_v4 %>%
select(state, date, vxc, vxcgte65, vxcgte18) %>%
group_by(date) %>%
summarize(across(where(is.numeric), sum)) %>%
mutate(across(where(is.numeric), .fn=function(x) ifelse(lag(x)==0, NA, x - lag(x)), .names="d_{.col}"))
deltaData_210717_v4 %>%
select(date, starts_with("d")) %>%
pivot_longer(-date) %>%
filter(!is.na(value)) %>%
ggplot(aes(x=date, y=value)) +
geom_line(aes(group=name, color=name))
deltaData_210717_v4 %>%
filter(d_vxc < d_vxcgte18)
## # A tibble: 1 x 7
## date vxc vxcgte65 vxcgte18 d_vxc d_vxcgte65 d_vxcgte18
## <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2021-04-07 63419456 30411378 63284674 1383566 412855 1454951
vaxPerCap_210717_func_v4 %>%
select(state, date, vxcgte65_perday, vxc1864_perday, vxc0017_perday) %>%
pivot_longer(-c(state, date)) %>%
mutate(age=stringr::str_replace_all(name, "vxc|_perday", "")) %>%
select(state, date, age, perday=value) %>%
left_join(popState, by=c("state", "age")) %>%
mutate(perday_percap=perday/pop) %>%
arrange(state, date, age) %>%
group_by(state, age) %>%
helperRollingAgg("perday_percap", newName="perday_percap7") %>%
ungroup() %>%
filter(!is.na(perday_percap7), state != "DC") %>%
ggplot(aes(x=date, y=1000*perday_percap7)) +
geom_line(aes(group=state, color=state.region[match(state, state.abb)])) +
labs(x=NULL,
y="Completed per thousand (rolling 7-day)",
title="Newly fully vaccinated by day",
subtitle="Persons vaccinated before age-breakouts included NOT counted"
) +
facet_wrap(~age) +
scale_color_discrete("Census\nRegion")
The data appear improved, though there are still a handful of states with negative vaccinations completed on given days. This is an area for further investigation.
A function is written to update key variables by day:
updateByDay <- function(df,
dateStart,
dateEnd=NULL,
nDates=NULL,
varGroup=c("state"),
varSort=c("date"),
exprList=list()
) {
# FUNCTION ARGUMENTS
# df: the data frame to be processed
# Convert dateStart to date if not already in that format
if ("character" %in% class(dateStart)) dateStart <- as.Date(dateStart)
if (!("Date") %in% class(dateStart)) stop("\nArgument dateStart must be a Date object or character YYYY-MM-DD\n")
# Create dateEnd from nDates if needed
if (is.null(dateEnd)) {
if (is.null(nDates)) stop("\nMust pass either dateEnd or nDates\n")
dateEnd <- dateStart + nDates - 1
}
if ("character" %in% class(dateEnd)) dateEnd <- as.Date(dateEnd)
if (!("Date") %in% class(dateEnd)) stop("\nArgument dateEnd must be a Date object or character YYYY-MM-DD\n")
# Declare the dates to be investigated
keyDates <- seq.Date(from=dateStart, to=dateEnd, by=1)
cat("\nData will be modified as needed for dates:", keyDates, "\n")
# Arrange and group the data as requested
df <- df %>%
arrange(across(all_of(c(varGroup, varSort)))) %>%
group_by(across(all_of(varGroup)))
# Make updates for each of the keyDates
for (keyDate in keyDates) {
df <- df %>%
mutate(modThis=(date %in% keyDate)) %>%
colMutater(fullList=exprList) %>%
select(-modThis)
}
df
}
modList <- list("vxcgte18"=expression(ifelse(modThis, lag(vxcgte18) + vxc - lag(vxc), vxcgte18)))
vaxPerCap_210717_func_v4 <- updateByDay(vaxPerCap_210717_func_v2,
dateStart="2021-04-03",
dateEnd="2021-04-06",
exprList=modList
) %>%
colMutater(fullList=subGroupList) %>%
colMutater(fullList=perDayList) %>%
ungroup()
##
## Data will be modified as needed for dates: 18720 18721 18722 18723
vaxPerCap_210717_func_v4 %>%
select(state, date, vxcgte65_perday, vxc1864_perday, vxc0017_perday) %>%
pivot_longer(-c(state, date)) %>%
mutate(age=stringr::str_replace_all(name, "vxc|_perday", "")) %>%
select(state, date, age, perday=value) %>%
left_join(popState, by=c("state", "age")) %>%
mutate(perday_percap=perday/pop) %>%
arrange(state, date, age) %>%
group_by(state, age) %>%
helperRollingAgg("perday_percap", newName="perday_percap7") %>%
ungroup() %>%
filter(!is.na(perday_percap7), state != "DC") %>%
ggplot(aes(x=date, y=1000*perday_percap7)) +
geom_line(aes(group=state, color=state.region[match(state, state.abb)])) +
labs(x=NULL,
y="Completed per thousand (rolling 7-day)",
title="Newly fully vaccinated by day",
subtitle="Persons vaccinated before age-breakouts included NOT counted"
) +
facet_wrap(~age) +
scale_color_discrete("Census\nRegion")
vaxPerCap_210717_func_v4 %>%
filter(vxcgte65_perday < 0) %>%
arrange(vxcgte65_perday) %>%
select(state, date, vxcgte65_perday, vxc1864_perday, vxc0017_perday, everything())
## # A tibble: 36 x 21
## state date vxcgte65_perday vxc1864_perday vxc0017_perday vxa vxc
## <chr> <date> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 TX 2021-04-06 -567992 693861 0 1.33e7 4.80e6
## 2 TX 2021-05-13 -135100 227765 3424 2.07e7 9.06e6
## 3 TX 2021-05-06 -115815 230012 3647 1.97e7 8.37e6
## 4 TX 2021-04-29 -112167 250804 3933 1.86e7 7.65e6
## 5 TX 2021-05-25 -73677 113485 9227 2.24e7 9.97e6
## 6 TX 2021-06-21 -57128 81490 6487 2.50e7 1.15e7
## 7 TX 2021-06-14 -55998 86975 6021 2.43e7 1.10e7
## 8 TX 2021-05-18 -55966 97475 7649 2.14e7 9.51e6
## 9 TX 2021-06-08 -48890 77092 10403 2.36e7 1.07e7
## 10 TX 2021-06-28 -39123 73748 5748 2.56e7 1.19e7
## # ... with 26 more rows, and 14 more variables: vxcgte65 <dbl>,
## # vxcgte65pct <dbl>, vxcgte18 <dbl>, vxcgte18pct <dbl>, Admin_Per_100k <dbl>,
## # vxcpoppct <dbl>, vxapm <dbl>, vxcpm <dbl>, vxapm7 <dbl>, vxcpm7 <dbl>,
## # vxc1864 <dbl>, vxc0017 <dbl>, vxa_perday <dbl>, vxc_perday <dbl>
vaxPerCap_210717_func_v4 %>%
filter(vxc1864_perday < 0) %>%
arrange(vxc1864_perday) %>%
select(state, date, vxcgte65_perday, vxc1864_perday, vxc0017_perday, everything())
## # A tibble: 47 x 21
## state date vxcgte65_perday vxc1864_perday vxc0017_perday vxa vxc
## <chr> <date> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 NJ 2021-06-25 -26417 -98167 -1631 9.67e6 4.78e6
## 2 TX 2021-04-19 159499 -60513 4017 1.67e7 6.56e6
## 3 FL 2021-04-06 69450 -44626 0 1.06e7 3.88e6
## 4 MI 2021-04-06 39659 -28104 0 4.94e6 1.95e6
## 5 NY 2021-04-06 85828 -25877 0 1.08e7 4.15e6
## 6 IL 2021-04-06 36950 -25200 0 6.63e6 2.36e6
## 7 OH 2021-04-06 40886 -24029 0 5.94e6 2.25e6
## 8 OR 2021-04-06 34523 -23691 0 2.09e6 8.12e5
## 9 MO 2021-04-06 23747 -20457 0 2.86e6 1.08e6
## 10 PA 2021-04-06 44807 -20157 0 6.80e6 2.45e6
## # ... with 37 more rows, and 14 more variables: vxcgte65 <dbl>,
## # vxcgte65pct <dbl>, vxcgte18 <dbl>, vxcgte18pct <dbl>, Admin_Per_100k <dbl>,
## # vxcpoppct <dbl>, vxapm <dbl>, vxcpm <dbl>, vxapm7 <dbl>, vxcpm7 <dbl>,
## # vxc1864 <dbl>, vxc0017 <dbl>, vxa_perday <dbl>, vxc_perday <dbl>
vaxPerCap_210717_func_v4 %>%
filter(vxc0017_perday < 0) %>%
arrange(vxc0017_perday) %>%
select(state, date, vxcgte65_perday, vxc1864_perday, vxc0017_perday, everything())
## # A tibble: 6 x 21
## state date vxcgte65_perday vxc1864_perday vxc0017_perday vxa vxc
## <chr> <date> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 WV 2021-05-15 1109 4234 -2298 1248326 587339
## 2 NJ 2021-06-25 -26417 -98167 -1631 9667905 4782589
## 3 NH 2021-04-07 4795 5910 -1468 829371 282496
## 4 DC 2021-04-14 501 3176 -68 471911 147799
## 5 AR 2021-06-11 -4 21 -1 2130617 969875
## 6 ND 2021-04-20 18 48 -1 527930 226960
## # ... with 14 more variables: vxcgte65 <dbl>, vxcgte65pct <dbl>,
## # vxcgte18 <dbl>, vxcgte18pct <dbl>, Admin_Per_100k <dbl>, vxcpoppct <dbl>,
## # vxapm <dbl>, vxcpm <dbl>, vxapm7 <dbl>, vxcpm7 <dbl>, vxc1864 <dbl>,
## # vxc0017 <dbl>, vxa_perday <dbl>, vxc_perday <dbl>
The data continue to be in better shape, with rolling-7 smoothing out many of the negative/positive swings. Next steps are to further investigate Texas (65+) and the April 6 data for 65+ and 18-64. It appears that most of the 65+ data is a bolus on April 6, which may need to be smoothed backwards.
Function updateByDay() is updated to allow for subsetting by specific states:
updateByDay <- function(df,
dateStart,
dateEnd=NULL,
nDates=NULL,
allDates=FALSE,
updateStates=NULL,
varGroup=c("state"),
varSort=c("date"),
exprList=list()
) {
# FUNCTION ARGUMENTS
# df: the data frame to be processed
# dateStart: the starting date for the changes
# dateEnd: the ending date for the changes (NULL means infer from dateStart and nDates)
# nDates: the number of days to include (if not NULL, dateEnd is set to dateStart + nDates - 1)
# allDates: boolean, if TRUE run the function for all dates at once rather than date by date
# updateStates: the states to be updated (NULL means all)
# varGroup: data should be grouped by this variable
# exprList: expression list for variable changes, passed as fullList to colMutater()
# Convert dateStart to date if not already in that format
if ("character" %in% class(dateStart)) dateStart <- as.Date(dateStart)
if (!("Date") %in% class(dateStart)) stop("\nArgument dateStart must be a Date object or character YYYY-MM-DD\n")
# Create dateEnd from nDates if needed
if (is.null(dateEnd) & !isTRUE(allDates)) {
if (is.null(nDates)) stop("\nMust pass either dateEnd or nDates or specify allDates=TRUE\n")
dateEnd <- dateStart + nDates - 1
}
if ("character" %in% class(dateEnd)) dateEnd <- as.Date(dateEnd)
if (!("Date") %in% class(dateEnd) & !isTRUE(allDates))
stop("\nArgument allDates must be TRUE or dateEnd must be a Date object or character YYYY-MM-DD\n")
# Declare the dates to be investigated
if (!isTRUE(allDates)) {
keyDates <- seq.Date(from=dateStart, to=dateEnd, by=1)
cat("\nData will be modified as needed for dates:", keyDates, "\n")
}
# Set the key states to be investigated if passed as NULL
if (is.null(updateStates)) updateStates <- df %>% pull(state) %>% unique() %>% sort()
# Arrange and group the data as requested
df <- df %>%
arrange(across(all_of(c(varGroup, varSort)))) %>%
group_by(across(all_of(varGroup)))
# Make updates for each of the keyDates
if (!isTRUE(allDates)) {
for (keyDate in keyDates) {
df <- df %>%
mutate(modThis=(date %in% keyDate) & (state %in% updateStates)) %>%
colMutater(fullList=exprList) %>%
select(-modThis)
}
} else {
df <- df %>%
mutate(modThis=state %in% updateStates) %>%
colMutater(fullList=exprList) %>%
select(-modThis)
}
df
}
fullModList <- list("vxcgte18"=expression(ifelse(modThis, lag(vxcgte18) + vxc - lag(vxc), vxcgte18)))
txModList <- list("vxc"=expression(ifelse(modThis, zoo::rollmean(vxc, k=7, fill=NA), vxc)),
"vxcgte18"=expression(ifelse(modThis, zoo::rollmean(vxcgte18, k=7, fill=NA), vxcgte18)),
"vxcgte65"=expression(ifelse(modThis, zoo::rollmean(vxcgte65, k=7, fill=NA), vxcgte65))
)
vaxPerCap_210717_func_v4_new <- updateByDay(vaxPerCap_210717_func_v2,
dateStart="2021-04-03",
dateEnd="2021-04-06",
exprList=fullModList
) %>%
updateByDay(dateStart="2020-01-01", allDates=TRUE, updateStates=c("TX"), exprList=txModList) %>%
colMutater(fullList=subGroupList) %>%
colMutater(fullList=perDayList) %>%
ungroup()
##
## Data will be modified as needed for dates: 18720 18721 18722 18723
vaxPerCap_210717_func_v4_new %>%
select(state, date, vxcgte65_perday, vxc1864_perday, vxc0017_perday) %>%
pivot_longer(-c(state, date)) %>%
mutate(age=stringr::str_replace_all(name, "vxc|_perday", "")) %>%
select(state, date, age, perday=value) %>%
left_join(popState, by=c("state", "age")) %>%
mutate(perday_percap=perday/pop) %>%
arrange(state, date, age) %>%
group_by(state, age) %>%
helperRollingAgg("perday_percap", newName="perday_percap7") %>%
ungroup() %>%
filter(!is.na(perday_percap7), state != "DC") %>%
ggplot(aes(x=date, y=1000*perday_percap7)) +
geom_line(aes(group=state, color=state.region[match(state, state.abb)])) +
labs(x=NULL,
y="Completed per thousand (rolling 7-day)",
title="Newly fully vaccinated by day",
subtitle="Persons vaccinated before age-breakouts included NOT counted"
) +
facet_wrap(~age) +
scale_color_discrete("Census\nRegion")
vaxPerCap_210717_func_v4_new %>%
filter(vxcgte65_perday < 0) %>%
arrange(vxcgte65_perday) %>%
select(state, date, vxcgte65_perday, vxc1864_perday, vxc0017_perday, everything())
## # A tibble: 32 x 21
## state date vxcgte65_perday vxc1864_perday vxc0017_perday vxa vxc
## <chr> <date> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 TX 2021-04-06 -46105. 176079. 2119. 1.33e7 4.82e6
## 2 TX 2021-04-09 -45525. 197402. 3544. 1.42e7 5.28e6
## 3 TX 2021-04-08 -43464. 195529. 3102 1.38e7 5.12e6
## 4 TX 2021-04-07 -42975. 189957. 2673. 1.35e7 4.97e6
## 5 TX 2021-04-05 -42769. 171650. 1931. 1.31e7 4.69e6
## 6 NJ 2021-06-25 -26417 -98167 -1631 9.67e6 4.78e6
## 7 TX 2021-04-04 -23817. 156215. 1826. 1.28e7 4.56e6
## 8 TX 2021-04-03 -22253. 154919. 686. 1.26e7 4.42e6
## 9 PA 2021-03-12 -14990 21557 1007 3.76e6 1.21e6
## 10 TX 2021-05-16 -8737. 88786. 8991. 2.12e7 9.34e6
## # ... with 22 more rows, and 14 more variables: vxcgte65 <dbl>,
## # vxcgte65pct <dbl>, vxcgte18 <dbl>, vxcgte18pct <dbl>, Admin_Per_100k <dbl>,
## # vxcpoppct <dbl>, vxapm <dbl>, vxcpm <dbl>, vxapm7 <dbl>, vxcpm7 <dbl>,
## # vxc1864 <dbl>, vxc0017 <dbl>, vxa_perday <dbl>, vxc_perday <dbl>
vaxPerCap_210717_func_v4_new %>%
filter(vxc1864_perday < 0) %>%
arrange(vxc1864_perday) %>%
select(state, date, vxcgte65_perday, vxc1864_perday, vxc0017_perday, everything())
## # A tibble: 45 x 21
## state date vxcgte65_perday vxc1864_perday vxc0017_perday vxa vxc
## <chr> <date> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 NJ 2021-06-25 -26417 -98167 -1631 9.67e6 4.78e6
## 2 FL 2021-04-06 69450 -44626 0 1.06e7 3.88e6
## 3 MI 2021-04-06 39659 -28104 0 4.94e6 1.95e6
## 4 NY 2021-04-06 85828 -25877 0 1.08e7 4.15e6
## 5 IL 2021-04-06 36950 -25200 0 6.63e6 2.36e6
## 6 OH 2021-04-06 40886 -24029 0 5.94e6 2.25e6
## 7 OR 2021-04-06 34523 -23691 0 2.09e6 8.12e5
## 8 MO 2021-04-06 23747 -20457 0 2.86e6 1.08e6
## 9 PA 2021-04-06 44807 -20157 0 6.80e6 2.45e6
## 10 WA 2021-04-06 30652 -19542 0 4.04e6 1.59e6
## # ... with 35 more rows, and 14 more variables: vxcgte65 <dbl>,
## # vxcgte65pct <dbl>, vxcgte18 <dbl>, vxcgte18pct <dbl>, Admin_Per_100k <dbl>,
## # vxcpoppct <dbl>, vxapm <dbl>, vxcpm <dbl>, vxapm7 <dbl>, vxcpm7 <dbl>,
## # vxc1864 <dbl>, vxc0017 <dbl>, vxa_perday <dbl>, vxc_perday <dbl>
vaxPerCap_210717_func_v4_new %>%
filter(vxc0017_perday < 0) %>%
arrange(vxc0017_perday) %>%
select(state, date, vxcgte65_perday, vxc1864_perday, vxc0017_perday, everything())
## # A tibble: 6 x 21
## state date vxcgte65_perday vxc1864_perday vxc0017_perday vxa vxc
## <chr> <date> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 WV 2021-05-15 1109 4234 -2298 1248326 587339
## 2 NJ 2021-06-25 -26417 -98167 -1631 9667905 4782589
## 3 NH 2021-04-07 4795 5910 -1468 829371 282496
## 4 DC 2021-04-14 501 3176 -68 471911 147799
## 5 AR 2021-06-11 -4 21 -1 2130617 969875
## 6 ND 2021-04-20 18 48 -1 527930 226960
## # ... with 14 more variables: vxcgte65 <dbl>, vxcgte65pct <dbl>,
## # vxcgte18 <dbl>, vxcgte18pct <dbl>, Admin_Per_100k <dbl>, vxcpoppct <dbl>,
## # vxapm <dbl>, vxcpm <dbl>, vxapm7 <dbl>, vxcpm7 <dbl>, vxc1864 <dbl>,
## # vxc0017 <dbl>, vxa_perday <dbl>, vxc_perday <dbl>
vaxPerCap_210717_func_v4_new %>%
filter(state=="TX") %>%
select(date, vxc, vxcgte65, vxcgte18) %>%
mutate(across(where(is.numeric), ~.-lag(.), .names="d_{.col}")) %>%
pivot_longer(-date) %>%
group_by(name) %>%
filter(!is.na(value), lag(value) != 0) %>%
ggplot(aes(x=date, y=value)) +
geom_line(aes(color=name, group=name)) +
facet_wrap(~name, scales="free_y")
vaxPerCap_210717_func_v4_new %>%
filter(state=="TX") %>%
select(date, vxc, vxcgte65, vxcgte18) %>%
mutate(across(where(is.numeric), ~.-lag(.), .names="d_{.col}")) %>%
select(date, starts_with("d_")) %>%
pivot_longer(-date) %>%
group_by(name) %>%
filter(!is.na(value), lag(value) != 0) %>%
ggplot(aes(x=date, y=value)) +
geom_line(aes(color=name, group=name))
vaxPerCap_210717_func_v4_new %>%
filter(state=="FL") %>%
select(date, vxc, vxcgte65, vxcgte18) %>%
mutate(across(where(is.numeric), ~.-lag(.), .names="d_{.col}")) %>%
pivot_longer(-date) %>%
group_by(name) %>%
filter(!is.na(value), lag(value) != 0) %>%
ggplot(aes(x=date, y=value)) +
geom_line(aes(color=name, group=name)) +
facet_wrap(~name, scales="free_y")
vaxPerCap_210717_func_v4_new %>%
filter(state=="FL") %>%
select(date, vxc, vxcgte65, vxcgte18) %>%
mutate(across(where(is.numeric), ~.-lag(.), .names="d_{.col}")) %>%
select(date, starts_with("d_")) %>%
pivot_longer(-date) %>%
group_by(name) %>%
filter(!is.na(value), lag(value) != 0) %>%
ggplot(aes(x=date, y=value)) +
geom_line(aes(color=name, group=name))
There is still work to do with the timing by age cohort, particularly around April 6.